# Load and inspect
library(tidyverse)
# read gz file directly
expr_raw <- read.delim("GSE76925_non-normalized (2).txt.gz",
header = TRUE,
sep = "\t",
check.names = FALSE)
dim(expr_raw)
[1] 47249 302
head(expr_raw[, 1:6])
NA
Here, sample1 (intensity), sample1.Detection.Pval (quality flag for that intensity).
# Split expression columns vs detection p-values What we did: We separated the dataset into: ->an expression matrix (expr_mat) ->a detection p-value matrix (detp_mat) Why we did it: Because detection p-values are not expression. They’re quality flags. Mixing them with expression breaks downstream analysis (PCA, DE, clustering).
# Expression columns are those NOT ending in ".Detection.Pval"
expr_mat <- expr_raw[, !grepl("Detection\\.Pval$", colnames(expr_raw)), drop = FALSE]
# Detection p-value columns DO end in ".Detection.Pval"
detp_mat <- expr_raw[, grepl("Detection\\.Pval$", colnames(expr_raw)), drop = FALSE]
# Sanity checks
dim(expr_mat); dim(detp_mat)
[1] 47249 151
[1] 47249 151
head(colnames(expr_mat), 6)
[1] "sample1" "sample2" "sample3" "sample4" "sample5" "sample6"
head(colnames(detp_mat), 6)
[1] "sample1.Detection.Pval" "sample2.Detection.Pval" "sample3.Detection.Pval" "sample4.Detection.Pval" "sample5.Detection.Pval"
[6] "sample6.Detection.Pval"
# Filter out probes that are mostly “not detected” What we did We kept only probes that are confidently detected in a decent fraction of samples.
Why we did it Microarrays include many probes that are basically noise in your tissue type. If you keep them:
->PCA becomes noisy ->differential expression has more false positives ->multiple testing burden explodes
# TRUE where probe is detected
detected <- detp_mat < 0.01
# Keep probes detected in at least 20% of samples
keep <- rowMeans(detected, na.rm = TRUE) >= 0.20
# Filter expression matrix
expr_f <- expr_mat[keep, , drop = FALSE]
dim(expr_f)
[1] 21618 151
# Log2 transform the intensities What we did We applied log2(x + 1) to the filtered expression matrix.
Why we did it Microarray intensity values are typically:
->right-skewed (a few probes have huge intensities) ->heteroscedastic (variance increases with mean)
Log-transforming:
->stabilises variance ->makes distributions more “normal-ish” ->improves PCA and linear modelling assumptions
expr_log <- log2(expr_f + 1)
summary(as.vector(expr_log))
Length Class Mode
sample1 21618 -none- numeric
sample2 21618 -none- numeric
sample3 21618 -none- numeric
sample4 21618 -none- numeric
sample5 21618 -none- numeric
sample6 21618 -none- numeric
sample7 21618 -none- numeric
sample8 21618 -none- numeric
sample9 21618 -none- numeric
sample10 21618 -none- numeric
sample11 21618 -none- numeric
sample12 21618 -none- numeric
sample13 21618 -none- numeric
sample14 21618 -none- numeric
sample15 21618 -none- numeric
sample16 21618 -none- numeric
sample17 21618 -none- numeric
sample18 21618 -none- numeric
sample19 21618 -none- numeric
sample20 21618 -none- numeric
sample21 21618 -none- numeric
sample22 21618 -none- numeric
sample23 21618 -none- numeric
sample24 21618 -none- numeric
sample25 21618 -none- numeric
sample26 21618 -none- numeric
sample27 21618 -none- numeric
sample28 21618 -none- numeric
sample29 21618 -none- numeric
sample30 21618 -none- numeric
sample31 21618 -none- numeric
sample32 21618 -none- numeric
sample33 21618 -none- numeric
sample34 21618 -none- numeric
sample35 21618 -none- numeric
sample36 21618 -none- numeric
sample37 21618 -none- numeric
sample38 21618 -none- numeric
sample39 21618 -none- numeric
sample40 21618 -none- numeric
sample41 21618 -none- numeric
sample42 21618 -none- numeric
sample43 21618 -none- numeric
sample44 21618 -none- numeric
sample45 21618 -none- numeric
sample46 21618 -none- numeric
sample47 21618 -none- numeric
sample48 21618 -none- numeric
sample49 21618 -none- numeric
sample50 21618 -none- numeric
sample51 21618 -none- numeric
sample52 21618 -none- numeric
sample53 21618 -none- numeric
sample54 21618 -none- numeric
sample55 21618 -none- numeric
sample56 21618 -none- numeric
sample57 21618 -none- numeric
sample58 21618 -none- numeric
sample59 21618 -none- numeric
sample60 21618 -none- numeric
sample61 21618 -none- numeric
sample62 21618 -none- numeric
sample63 21618 -none- numeric
sample64 21618 -none- numeric
sample65 21618 -none- numeric
sample66 21618 -none- numeric
sample67 21618 -none- numeric
sample68 21618 -none- numeric
sample69 21618 -none- numeric
sample70 21618 -none- numeric
sample71 21618 -none- numeric
sample72 21618 -none- numeric
sample73 21618 -none- numeric
sample74 21618 -none- numeric
sample75 21618 -none- numeric
sample76 21618 -none- numeric
sample77 21618 -none- numeric
sample78 21618 -none- numeric
sample79 21618 -none- numeric
sample80 21618 -none- numeric
sample81 21618 -none- numeric
sample82 21618 -none- numeric
sample83 21618 -none- numeric
sample84 21618 -none- numeric
sample85 21618 -none- numeric
sample86 21618 -none- numeric
sample87 21618 -none- numeric
sample88 21618 -none- numeric
sample89 21618 -none- numeric
sample90 21618 -none- numeric
sample91 21618 -none- numeric
sample92 21618 -none- numeric
sample93 21618 -none- numeric
sample94 21618 -none- numeric
sample95 21618 -none- numeric
sample96 21618 -none- numeric
sample97 21618 -none- numeric
sample98 21618 -none- numeric
sample99 21618 -none- numeric
sample100 21618 -none- numeric
sample101 21618 -none- numeric
sample102 21618 -none- numeric
sample103 21618 -none- numeric
sample104 21618 -none- numeric
sample105 21618 -none- numeric
sample106 21618 -none- numeric
sample107 21618 -none- numeric
sample108 21618 -none- numeric
sample109 21618 -none- numeric
sample110 21618 -none- numeric
sample111 21618 -none- numeric
sample112 21618 -none- numeric
sample113 21618 -none- numeric
sample114 21618 -none- numeric
sample115 21618 -none- numeric
sample116 21618 -none- numeric
sample117 21618 -none- numeric
sample118 21618 -none- numeric
sample119 21618 -none- numeric
sample120 21618 -none- numeric
sample121 21618 -none- numeric
sample122 21618 -none- numeric
sample123 21618 -none- numeric
sample124 21618 -none- numeric
sample125 21618 -none- numeric
sample126 21618 -none- numeric
sample127 21618 -none- numeric
sample128 21618 -none- numeric
sample129 21618 -none- numeric
sample130 21618 -none- numeric
sample131 21618 -none- numeric
sample132 21618 -none- numeric
sample133 21618 -none- numeric
sample134 21618 -none- numeric
sample135 21618 -none- numeric
sample136 21618 -none- numeric
sample137 21618 -none- numeric
sample138 21618 -none- numeric
sample139 21618 -none- numeric
sample140 21618 -none- numeric
sample141 21618 -none- numeric
sample142 21618 -none- numeric
sample143 21618 -none- numeric
sample144 21618 -none- numeric
sample145 21618 -none- numeric
sample146 21618 -none- numeric
sample147 21618 -none- numeric
sample148 21618 -none- numeric
sample149 21618 -none- numeric
sample150 21618 -none- numeric
sample151 21618 -none- numeric
#Attach real COPD vs Control labels What we need now Right now, you have expression values but no group labels (COPD vs control).To run proper COPD analysis (limma), we must fetch phenotype metadata from GEO and map it to your samples.
Why we need it Differential expression requires a design matrix like:
->COPD = 1 ->Control = 0 Optionally adjusted for age/sex/etc.
=> Without metadata, we can do unsupervised PCA, but not “COPD vs Control DE”.
What we’ll do next (conceptually)
->Download GEO metadata for GSE76925 ->Identify which samples are COPD vs control ->Align those labels with your column names ->Then run PCA + limma
###Quick revision summary (one-liners) =>Split expression vs Detection.Pval → because Detection.Pval is QC, not expression =>Filter low-detected probes → because many probes are noise and ruin analysis =>Log2 transform → because intensities are skewed and variance isn’t stable =>Fetch phenotype labels from GEO → because you need COPD/control labels for DE
#Get the real sample metadata from GEO What we’re doing We’re pulling the official GEO “phenotype” table for GSE76925 (the sample annotations).
Why we’re doing it Because differential expression needs labels (COPD vs control) and ideally covariates (age/sex/smoking). Your expression matrix currently has none of that.
library(GEOquery)
gse <- getGEO("GSE76925", GSEMatrix = TRUE)
eset <- gse[[1]]
pheno <- pData(eset)
dim(pheno)
[1] 151 58
colnames(pheno)[1:20]
[1] "title" "geo_accession" "status" "submission_date" "last_update_date"
[6] "type" "channel_count" "source_name_ch1" "organism_ch1" "characteristics_ch1"
[11] "characteristics_ch1.1" "characteristics_ch1.2" "characteristics_ch1.3" "characteristics_ch1.4" "characteristics_ch1.5"
[16] "characteristics_ch1.6" "characteristics_ch1.7" "characteristics_ch1.8" "characteristics_ch1.9" "characteristics_ch1.10"
head(pheno[, 1:10])
NA
#Step 1 — Confirm sample counts match
#What we are checking:If the number of expression columns equals number of metadata rows.
#Why:If they match, then order-based mapping is valid.
ncol(expr_log)
[1] 151
nrow(pheno)
[1] 151
#Step 2 — Map expression columns to GSM IDs
#What we are doing: We replace generic names (sample1) with actual GSM IDs.
#Why: So every expression column corresponds to real biological sample IDs.
colnames(expr_log) <- pheno$geo_accession
head(colnames(expr_log))
[1] "GSM2040792" "GSM2040793" "GSM2040794" "GSM2040795" "GSM2040796" "GSM2040797"
#Step 3 — Create disease status variable
#What we are doing: Extract case vs control from title.
#Why: Differential expression requires a design matrix.
group <- ifelse(grepl("case", pheno$title, ignore.case = TRUE),
"COPD", "Control")
table(group)
group
Control COPD
40 111
#Create a factor:
group <- factor(group, levels = c("Control", "COPD"))
# Step 4 — Why this is important?
cat("It ensured that expression columns were correctly mapped to GEO sample IDs before assigning disease status. Misalignment between phenotype and expression data is a common source of downstream analytical errors, so I verified counts and mapping before proceeding.")
It ensured that expression columns were correctly mapped to GEO sample IDs before assigning disease status. Misalignment between phenotype and expression data is a common source of downstream analytical errors, so I verified counts and mapping before proceeding.
# PCA (Unsupervised Check) What we are doing Principal Component Analysis on samples. Why ->Do COPD samples separate from controls? ->Are there outliers? ->Is there strong global structure? =>If PCA already separates groups, that’s powerful biological signal.
pca <- prcomp(t(expr_log), scale. = TRUE)
pca_df <- data.frame(
PC1 = pca$x[,1],
PC2 = pca$x[,2],
group = group
)
library(ggplot2)
ggplot(pca_df, aes(PC1, PC2, color = group)) +
geom_point(size = 4, alpha = 0.8) +
stat_ellipse(aes(fill = group), geom = "polygon", alpha = 0.1, linetype = "dashed") +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16), # centered title
axis.title = element_text(face = "bold"), # bold axis labels
legend.title = element_text(face = "bold"), # bold legend title
legend.position = "right",
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "PCA of Lung Tissue Expression (GSE76925)",
x = "PC1",
y = "PC2",
color = "Group"
)
cat("The PCA shows partial separation but substantial overlap between COPD and controls, suggesting that while disease-related transcriptional shifts exist, COPD exhibits considerable molecular heterogeneity. The increased dispersion in COPD samples may reflect underlying biological subtypes.”")
The PCA shows partial separation but substantial overlap between COPD and controls, suggesting that while disease-related transcriptional shifts exist, COPD exhibits considerable molecular heterogeneity. The increased dispersion in COPD samples may reflect underlying biological subtypes.”
summary(pca)$importance[2, 1:5]
PC1 PC2 PC3 PC4 PC5
0.64029 0.08009 0.04342 0.02154 0.01908
cat("\n\nAlthough COPD and control samples show partial separation, the dominant variance axis (PC1, ~64%) does not appear to correspond directly to disease status. This suggests that other biological or technical factors contribute substantially to transcriptional variability. The broader dispersion among COPD samples may reflect disease heterogeneity or differences in inflammatory or tissue-remodeling signatures.")
Although COPD and control samples show partial separation, the dominant variance axis (PC1, ~64%) does not appear to correspond directly to disease status. This suggests that other biological or technical factors contribute substantially to transcriptional variability. The broader dispersion among COPD samples may reflect disease heterogeneity or differences in inflammatory or tissue-remodeling signatures.
I re-ran PCA without gene scaling because z-scaling across genes forces each probe to have equal variance, which can over-weight low-variance or noisy probes in transcriptomic data. For microarray expression analysis, log-transformation is typically sufficient, and PCA without scaling preserves the natural variance structure across genes.
pca_noscale <- prcomp(t(expr_log), center = TRUE, scale. = FALSE)
pca_df2 <- data.frame(
PC1 = pca_noscale$x[,1],
PC2 = pca_noscale$x[,2],
group = group
)
summary(pca_noscale)$importance[2, 1:5]
PC1 PC2 PC3 PC4 PC5
0.60078 0.11859 0.04323 0.02830 0.02105
library(ggplot2)
# (Recreate the plotting data frame if needed)
pca_df2 <- data.frame(
PC1 = pca_noscale$x[, 1],
PC2 = pca_noscale$x[, 2],
group = group
)
ggplot(pca_df2, aes(PC1, PC2, color = group)) +
geom_point(size = 3, alpha = 0.9) +
stat_ellipse(aes(fill = group), geom = "polygon", alpha = 0.12, linetype = 2) +
theme_minimal() +
labs(title = "PCA (no scaling) — GSE76925", x = "PC1", y = "PC2")
That tells us whether the dominant variance axis reflects disease.
#Step 1 — Extract PC1 scores
pca_noscale$x
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
GSM2040792 -223.0394742 -19.599722 -17.78976603 21.1877203 -5.41285831 12.41129155 -0.5757842 28.8975034 -3.3520886 0.698014533
GSM2040793 -199.5488581 -2.381686 -9.93519624 17.3324375 -2.23032752 -3.87951608 6.6563653 32.3377786 11.1979481 1.978181334
GSM2040794 -129.8319451 10.441590 -39.92298499 15.6619854 -3.07919880 -1.37762020 8.1873821 8.7577886 8.5011933 3.210168256
GSM2040795 -103.5717446 16.867693 -41.98749753 19.7895646 4.96204725 -7.54317450 -0.9454274 3.4786327 10.3214616 6.191604603
GSM2040796 12.0887928 49.626460 -39.89070684 -37.5391431 7.60760955 9.46924348 4.7438525 -18.2315657 -17.7951398 -12.855632104
GSM2040797 -16.6780162 28.920605 -74.45247082 -32.4864260 27.40375485 9.69582375 15.7063632 -16.1753738 -14.3982716 5.747575132
PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20
GSM2040792 -6.964184931 13.16275175 -6.78179411 4.3547080 -1.72552471 4.36078220 -1.19779898 2.7962572 -8.98329178 1.58783049
GSM2040793 -6.316322434 16.18203194 -5.70463842 -5.5475550 0.04099332 -2.43094383 -6.01106974 6.9977513 7.52916874 3.51204564
GSM2040794 -6.343085719 7.18374575 -2.45475300 -4.2616474 3.54245273 -0.86627474 5.85507247 8.4080377 12.73785136 -0.71596649
GSM2040795 -4.064374444 9.53698650 -2.18278489 -13.0320417 -3.52157416 2.70603689 7.67816479 2.7695716 10.22666619 -11.31708910
GSM2040796 2.318636723 7.31805735 14.21838044 -1.9093388 2.86787890 -6.40548426 -5.02494785 2.2478858 -6.05577832 -6.20806825
GSM2040797 -8.060814514 -1.11322604 9.17711497 -1.5310042 -6.63528819 -2.40868345 -6.65306759 -0.5345537 -7.66526184 -6.73156219
PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
GSM2040792 -5.21537275 2.57643352 -0.18389759 -9.01028437 6.92094529 1.0535765 -2.1767059 -12.47227785 1.72996940 3.54105186
GSM2040793 -1.34258514 -3.95141980 -8.26827114 4.95149390 -1.15207082 7.4289010 0.9532863 8.60773066 4.78343565 1.54522680
GSM2040794 3.43687330 -2.47378056 -6.00654294 12.00856778 -0.07126944 7.8029910 6.9810852 10.62928296 4.05796516 -2.46149616
GSM2040795 2.84004249 -0.22249572 4.14302247 10.79961290 7.82168388 -3.2390833 3.7510895 5.24250322 2.47796914 -7.26330711
GSM2040796 4.88403472 -22.37891523 10.18140958 3.32169407 -0.13153712 6.9892522 -6.0368600 -4.55951014 -7.56911602 -10.53234926
GSM2040797 6.16472454 -3.87783175 -2.82125180 3.69454919 10.38104336 -0.9084373 3.1687313 0.51696037 4.76964696 -4.83167141
PC31 PC32 PC33 PC34 PC35 PC36 PC37 PC38 PC39
GSM2040792 1.450448984 -4.03541238 6.426956042 -1.03518513 5.81165474 -6.94838217 6.42675806 -1.155170719 2.508348e-01
GSM2040793 -0.588218567 -0.75124922 7.470638723 8.32534550 8.85184746 0.21683157 -2.98150033 -4.874003795 4.885952e+00
GSM2040794 6.195169816 -0.18465957 7.003341281 4.94702371 3.99622646 2.21686580 0.35488828 -4.431313852 2.061601e+00
GSM2040795 7.405718511 8.93637423 -9.430091031 4.17259195 3.56030480 2.03627902 -0.06295327 -1.236526451 1.057598e+00
GSM2040796 -7.898998747 6.27591846 2.514484478 8.95642041 -2.89610074 -1.20931211 6.30999210 -6.490330919 -1.406954e+00
GSM2040797 -2.848611859 -1.77660825 -1.005077067 1.91394755 -1.79967364 4.37454374 3.38407123 -8.358923799 -8.631874e+00
PC40 PC41 PC42 PC43 PC44 PC45 PC46 PC47 PC48 PC49
GSM2040792 -2.70092900 0.63114569 -4.0302322 5.79038421 -1.092421117 3.569528311 -0.21015248 -3.75016209 0.98208324 -2.96138692
GSM2040793 6.54396081 1.56223971 0.7975053 4.87920483 -6.694319873 3.983891225 1.83009237 -1.97748440 -0.82453667 -3.27282909
GSM2040794 1.08095948 3.97464232 -0.1563169 1.79280186 -5.510137326 1.644855960 1.57034010 -2.76328666 -0.84224421 -4.00172063
GSM2040795 3.29366524 7.70588207 -2.7213449 0.41043072 -3.867252383 -5.895224835 2.62141871 3.54179922 2.54554908 -0.24514715
GSM2040796 -0.91042971 -2.10792206 -2.0939545 7.86325113 -3.683125087 -6.101567347 0.05983379 3.95271892 -3.63273896 7.03751407
GSM2040797 -1.12277201 -0.33131389 2.4197665 -4.03518149 1.764567865 5.618108831 3.20371130 -5.40730120 4.67821864 -2.35548618
PC50 PC51 PC52 PC53 PC54 PC55 PC56 PC57 PC58 PC59
GSM2040792 -1.20285400 8.61392478 -7.016808557 -4.42393681 0.75140156 7.921483936 -0.65870057 -1.35392893 -6.3279946 2.390932592
GSM2040793 -2.92763209 -0.93485108 4.986744545 0.78388679 0.26717164 -0.356511441 -2.46282714 -0.53501628 4.1231611 4.616589904
GSM2040794 0.71132272 -1.20180963 1.371507456 1.33480705 1.22913973 4.352888661 -3.36052470 -2.01667366 2.1701272 6.148381366
GSM2040795 -3.42242482 -0.49419086 -3.794613687 2.20802064 2.94062886 -2.878448396 0.20947852 -3.53061491 5.4781342 3.179681998
GSM2040796 -4.73856032 -3.23378317 -0.826036144 2.18887554 -0.62619934 -0.046656395 1.16730942 1.62281420 -0.2995751 -5.234851576
GSM2040797 2.96889943 4.70831369 4.081270302 -2.10574692 4.40648525 1.291896666 -0.34114845 -4.58096322 4.7638330 -0.665624665
PC60 PC61 PC62 PC63 PC64 PC65 PC66 PC67 PC68 PC69
GSM2040792 4.19128602 -1.61176918 -1.75040430 -4.6588828496 0.83215521 0.068052981 2.34139919 -4.19571334 -0.89927766 -1.40673572
GSM2040793 1.93241326 1.95748706 4.16013027 2.5754698539 1.92842962 -0.009762846 -0.47552914 -3.82168008 -0.95632919 -3.26851516
GSM2040794 1.46827887 -1.94412979 4.07739355 2.5874953951 -1.06118717 -0.539764079 -1.79844479 -1.52922741 -0.83548659 2.79680380
GSM2040795 0.96267069 -0.77104645 -5.71158394 1.5039959060 -1.04638515 -1.151205745 2.88470565 7.63975457 0.83812712 -3.25930027
GSM2040796 0.88896501 -4.83382781 5.02117325 3.9752705193 7.45920093 0.656192869 0.89770330 -4.77158703 -1.39328851 -3.43867983
GSM2040797 -1.10941221 1.40905273 1.17064567 -6.3193567763 -3.31548116 -1.515263529 0.37838083 3.61193470 2.07866160 9.01009885
PC70 PC71 PC72 PC73 PC74 PC75 PC76 PC77 PC78 PC79
GSM2040792 -0.26347050 -0.389126192 -0.44213109 1.13055431 4.806304373 -3.128780030 -1.13922760 -3.26695292 -2.41456162 -4.921598463
GSM2040793 -2.64389041 -0.533748486 -1.26644945 -2.39199172 1.039367016 3.037619885 -1.80491779 -0.93885756 -4.09845662 2.881126901
GSM2040794 1.05001383 -0.761474499 -0.04198317 1.64197465 0.930195796 -4.391029801 2.13254016 -2.42126798 2.10243934 -0.029456247
GSM2040795 3.51302162 2.433177649 3.94509797 1.66007034 -1.796480269 -0.187367691 1.24883240 0.77722510 6.21053670 -2.041350212
GSM2040796 0.54993291 0.537027963 -2.19561426 -3.96703207 3.219152144 -1.443878902 1.57589196 -6.77609267 -0.52015914 -0.106125392
GSM2040797 1.72779540 -2.525183110 -4.60572750 1.50872746 1.240150743 2.328891024 -4.03371439 1.01721967 -4.53136037 2.616590118
PC80 PC81 PC82 PC83 PC84 PC85 PC86 PC87 PC88 PC89
GSM2040792 2.94940047 1.45065969 4.25510381 2.70134207 2.27196058 -4.6619410112 -0.51905734 -3.10468132 -1.91065331 0.26814515
GSM2040793 1.12390927 2.77943714 -0.38901438 -4.91285331 -3.71230463 1.8708272239 -2.51874412 0.56731580 1.28551201 0.58711482
GSM2040794 -1.19752553 0.72562919 -2.21676001 -1.24117406 -2.37514901 0.4501905268 1.01897916 0.53685941 -0.13102319 -0.24540684
GSM2040795 0.98377171 -0.37882791 -2.86254183 5.39864412 4.83689927 -6.1624890934 -3.53520242 -1.50021807 0.07693095 2.69542184
GSM2040796 -1.77326079 -3.71539372 -1.03657806 2.66402383 0.26403498 -3.4230130626 2.88053217 -2.25152995 1.38739592 1.30579876
GSM2040797 1.97516902 -0.60642179 2.14369239 -2.89021922 0.76218166 0.8385516984 -0.48562018 -0.23581874 1.69989507 -0.21632598
PC90 PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99
GSM2040792 0.150785962 5.35024617 1.238598889 1.51962386 1.32827505 -0.61698971 -4.694927454 -1.63307930 -3.354701233 -0.820930771
GSM2040793 -0.496032018 -1.91041616 -0.595014571 -0.43531186 0.66849683 -4.78154786 -0.312459142 3.33582981 3.390999033 2.508644894
GSM2040794 -1.926625527 2.09296747 1.735039274 -0.56890228 -2.59441477 4.50755540 1.630598052 1.54893177 -1.604464745 -0.191274205
GSM2040795 -0.006653669 1.23685232 -1.425608464 0.56957728 1.93960905 0.36094282 -3.261137318 -2.10517125 -0.771206239 -2.258108346
GSM2040796 0.986391128 0.52209776 0.533669346 3.07016874 1.26500556 -3.06613183 0.087194941 -0.20593220 -0.993841377 1.191137954
GSM2040797 2.245443500 -1.75433043 -2.502589786 1.32172803 -0.49885017 1.64105082 -2.244264297 -6.05185155 -2.797556778 -3.248640961
PC100 PC101 PC102 PC103 PC104 PC105 PC106 PC107 PC108 PC109
GSM2040792 1.49803201 3.217761801 1.348402535 0.43250281 2.15621172 0.767011659 -2.33475658 -1.42594421 -2.534271702 -1.09162947
GSM2040793 1.37079125 -4.920532908 -1.082084794 -0.29062168 0.56087870 -0.177253835 -0.07545270 -2.60172704 0.986991969 2.70454442
GSM2040794 -2.75536720 -0.182093099 -0.005725923 -0.91370932 -3.52212402 1.595803904 -1.07059681 -1.13292000 1.525563082 -1.15144364
GSM2040795 -0.54730007 -0.167104147 -0.677983459 2.23549166 2.01120222 1.262658204 -0.92563688 -1.21511452 -2.054460298 0.28996766
GSM2040796 -1.74404176 -2.013743836 0.476091287 -1.90401924 -1.84090672 0.484518746 1.69598539 -0.51832157 1.840808253 -1.99119246
GSM2040797 1.25502680 -2.693639149 3.139903325 3.07645072 -0.76372214 0.448582227 0.39474253 -0.14867615 3.904927379 4.97813687
PC110 PC111 PC112 PC113 PC114 PC115 PC116 PC117 PC118 PC119
GSM2040792 -2.81067638 -1.730405644 -0.30041646 0.03215978 4.3988898016 0.32888361 0.26548391 2.2295662120 -2.87510240 -0.69961408
GSM2040793 -0.41453832 1.438990450 -2.94153128 2.63979176 1.1462514423 -0.26291984 3.03015323 0.1418160182 0.28589403 -1.73602474
GSM2040794 -0.16983819 0.741169934 1.16767985 -2.01462841 -0.3601207738 -0.01058388 0.26274042 0.5714098824 0.68516101 4.58154707
GSM2040795 1.98200264 -0.972256984 0.50136930 -0.87597409 -1.3897920958 -0.97891321 -2.11043161 -2.4378800144 -0.24452715 -3.50653730
GSM2040796 0.79548922 -2.818500397 0.43467528 2.08477616 0.4253228914 0.11470259 -0.08901078 -0.7697083429 -0.57899397 0.61388726
GSM2040797 2.73738163 0.502888382 1.38042997 1.69540793 1.4057161883 1.32974203 -1.43833975 0.6146140825 -2.56467885 1.49616738
PC120 PC121 PC122 PC123 PC124 PC125 PC126 PC127 PC128 PC129
GSM2040792 -0.76863504 -0.05381478 -1.58223700 -0.41196309 -0.31160382 -1.07570096 2.30714835 1.50267352 0.41021709 1.122126930
GSM2040793 3.86936444 -0.90889618 0.59623781 1.61749995 -1.33862361 -1.95652754 -1.35066279 -1.41496757 -1.84549177 -4.583255974
GSM2040794 -1.43708033 1.35646446 -0.99059507 0.52125723 -0.03147995 0.63482666 0.54297132 -0.05615791 4.72600899 6.720015114
GSM2040795 -3.80902710 1.21850498 -0.68636333 1.65945191 0.92111640 0.12755215 -1.09636829 0.46975912 -1.61691615 -1.747402359
GSM2040796 1.12549255 -1.93336254 -1.10816758 -0.94290299 0.99973915 -0.47709612 0.73071503 0.88063018 0.26715576 0.366515884
GSM2040797 1.26560977 0.23331317 0.92893860 0.97265047 -1.83542895 1.00191092 -0.32009314 2.26290168 -0.97079916 -0.688989960
PC130 PC131 PC132 PC133 PC134 PC135 PC136 PC137 PC138 PC139
GSM2040792 0.05243377 1.719511088 0.92849005 -2.41015983 -1.58230720 -1.873445326 1.556663389 2.341496504 0.5409756322 1.107048448
GSM2040793 3.68476720 -1.188416751 -2.06667566 0.35305297 0.35051252 -0.545775822 -1.508882395 -0.475946880 1.4095726465 4.430376940
GSM2040794 -3.98241389 1.546582704 1.72536637 -1.97355632 -1.44126364 3.373380563 1.043928675 -1.565442623 -0.7701613743 -6.742326725
GSM2040795 1.90687166 -2.651946683 2.62416316 1.30311240 -1.07012465 -3.032178789 0.296002652 0.786847157 -1.7186375905 3.156957470
GSM2040796 1.64981959 -0.033204956 0.54512481 0.89359139 -0.23015435 0.039849359 0.830302054 -0.527528446 -0.3663443956 -0.209927867
GSM2040797 0.98662215 -1.129994025 -1.81917664 -0.50749356 0.46235582 -0.681593549 1.067604775 0.077288856 0.0581103928 -0.134045443
PC140 PC141 PC142 PC143 PC144 PC145 PC146 PC147 PC148 PC149
GSM2040792 0.18215091 -2.17447518 -1.09930366 0.492231126 0.009040347 0.33278162 -0.568080735 0.57965511 -0.21585714 0.497396786
GSM2040793 2.08637237 0.31218333 0.41750846 2.782575306 0.478427320 1.25062191 -3.815725639 0.28625798 0.00884119 0.076060482
GSM2040794 -1.77022914 -0.67808501 0.18961429 -3.954743351 -3.479171264 -2.00247052 5.142802250 -0.52076025 0.67425691 0.636811379
GSM2040795 -1.18386750 2.03517489 0.08120949 -0.579022448 1.328282550 1.56928843 -1.142621862 0.81543883 0.24435047 0.459989874
GSM2040796 0.10281617 0.55787360 -0.65440099 0.254577422 -0.191324042 -0.23692024 0.745416084 -0.25460850 1.02182318 0.004068692
GSM2040797 1.54842975 -0.79174253 0.44800061 -2.053983481 2.192225730 -1.62562249 -0.582344326 -0.41507597 0.05489578 -0.291139968
PC150 PC151
GSM2040792 -1.918155e-02 -4.648163e-13
GSM2040793 3.986442e-02 -4.722894e-13
GSM2040794 -5.539206e-02 -4.719866e-13
GSM2040795 -3.648964e-02 -4.682413e-13
GSM2040796 -1.134902e-01 -4.329052e-13
GSM2040797 1.299697e-01 -4.453689e-13
[ reached getOption("max.print") -- omitted 145 rows ]
pc1_scores <- pca_noscale$x[, 1] #So extract PC1:
#Step 2 — Compare PC1 between groups
#We do a simple t-test, Why a t-test? Because: PC1 is continuous, group is binary (COPD vs Control), we are testing mean difference
t.test(pc1_scores ~ group)
Welch Two Sample t-test
data: pc1_scores by group
t = 0.1672, df = 63.565, p-value = 0.8677
alternative hypothesis: true difference in means between group Control and group COPD is not equal to 0
95 percent confidence interval:
-31.22036 36.92301
sample estimates:
mean in group Control mean in group COPD
2.0960079 -0.7553182
#Step 3 — Visualize it
library(ggplot2)
df_pc1 <- data.frame(
PC1 = pc1_scores,
group = group
)
ggplot(df_pc1, aes(group, PC1, fill = group)) +
geom_boxplot(alpha = 0.6) +
theme_minimal() +
labs(title = "PC1 Distribution by Disease Status")
#Step 1 — Extract PC1 scores
pca_noscale$x
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
GSM2040792 -223.0394742 -19.599722 -17.78976603 21.1877203 -5.41285831 12.41129155 -0.5757842 28.8975034 -3.3520886 0.698014533
GSM2040793 -199.5488581 -2.381686 -9.93519624 17.3324375 -2.23032752 -3.87951608 6.6563653 32.3377786 11.1979481 1.978181334
GSM2040794 -129.8319451 10.441590 -39.92298499 15.6619854 -3.07919880 -1.37762020 8.1873821 8.7577886 8.5011933 3.210168256
GSM2040795 -103.5717446 16.867693 -41.98749753 19.7895646 4.96204725 -7.54317450 -0.9454274 3.4786327 10.3214616 6.191604603
GSM2040796 12.0887928 49.626460 -39.89070684 -37.5391431 7.60760955 9.46924348 4.7438525 -18.2315657 -17.7951398 -12.855632104
GSM2040797 -16.6780162 28.920605 -74.45247082 -32.4864260 27.40375485 9.69582375 15.7063632 -16.1753738 -14.3982716 5.747575132
PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20
GSM2040792 -6.964184931 13.16275175 -6.78179411 4.3547080 -1.72552471 4.36078220 -1.19779898 2.7962572 -8.98329178 1.58783049
GSM2040793 -6.316322434 16.18203194 -5.70463842 -5.5475550 0.04099332 -2.43094383 -6.01106974 6.9977513 7.52916874 3.51204564
GSM2040794 -6.343085719 7.18374575 -2.45475300 -4.2616474 3.54245273 -0.86627474 5.85507247 8.4080377 12.73785136 -0.71596649
GSM2040795 -4.064374444 9.53698650 -2.18278489 -13.0320417 -3.52157416 2.70603689 7.67816479 2.7695716 10.22666619 -11.31708910
GSM2040796 2.318636723 7.31805735 14.21838044 -1.9093388 2.86787890 -6.40548426 -5.02494785 2.2478858 -6.05577832 -6.20806825
GSM2040797 -8.060814514 -1.11322604 9.17711497 -1.5310042 -6.63528819 -2.40868345 -6.65306759 -0.5345537 -7.66526184 -6.73156219
PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
GSM2040792 -5.21537275 2.57643352 -0.18389759 -9.01028437 6.92094529 1.0535765 -2.1767059 -12.47227785 1.72996940 3.54105186
GSM2040793 -1.34258514 -3.95141980 -8.26827114 4.95149390 -1.15207082 7.4289010 0.9532863 8.60773066 4.78343565 1.54522680
GSM2040794 3.43687330 -2.47378056 -6.00654294 12.00856778 -0.07126944 7.8029910 6.9810852 10.62928296 4.05796516 -2.46149616
GSM2040795 2.84004249 -0.22249572 4.14302247 10.79961290 7.82168388 -3.2390833 3.7510895 5.24250322 2.47796914 -7.26330711
GSM2040796 4.88403472 -22.37891523 10.18140958 3.32169407 -0.13153712 6.9892522 -6.0368600 -4.55951014 -7.56911602 -10.53234926
GSM2040797 6.16472454 -3.87783175 -2.82125180 3.69454919 10.38104336 -0.9084373 3.1687313 0.51696037 4.76964696 -4.83167141
PC31 PC32 PC33 PC34 PC35 PC36 PC37 PC38 PC39
GSM2040792 1.450448984 -4.03541238 6.426956042 -1.03518513 5.81165474 -6.94838217 6.42675806 -1.155170719 2.508348e-01
GSM2040793 -0.588218567 -0.75124922 7.470638723 8.32534550 8.85184746 0.21683157 -2.98150033 -4.874003795 4.885952e+00
GSM2040794 6.195169816 -0.18465957 7.003341281 4.94702371 3.99622646 2.21686580 0.35488828 -4.431313852 2.061601e+00
GSM2040795 7.405718511 8.93637423 -9.430091031 4.17259195 3.56030480 2.03627902 -0.06295327 -1.236526451 1.057598e+00
GSM2040796 -7.898998747 6.27591846 2.514484478 8.95642041 -2.89610074 -1.20931211 6.30999210 -6.490330919 -1.406954e+00
GSM2040797 -2.848611859 -1.77660825 -1.005077067 1.91394755 -1.79967364 4.37454374 3.38407123 -8.358923799 -8.631874e+00
PC40 PC41 PC42 PC43 PC44 PC45 PC46 PC47 PC48 PC49
GSM2040792 -2.70092900 0.63114569 -4.0302322 5.79038421 -1.092421117 3.569528311 -0.21015248 -3.75016209 0.98208324 -2.96138692
GSM2040793 6.54396081 1.56223971 0.7975053 4.87920483 -6.694319873 3.983891225 1.83009237 -1.97748440 -0.82453667 -3.27282909
GSM2040794 1.08095948 3.97464232 -0.1563169 1.79280186 -5.510137326 1.644855960 1.57034010 -2.76328666 -0.84224421 -4.00172063
GSM2040795 3.29366524 7.70588207 -2.7213449 0.41043072 -3.867252383 -5.895224835 2.62141871 3.54179922 2.54554908 -0.24514715
GSM2040796 -0.91042971 -2.10792206 -2.0939545 7.86325113 -3.683125087 -6.101567347 0.05983379 3.95271892 -3.63273896 7.03751407
GSM2040797 -1.12277201 -0.33131389 2.4197665 -4.03518149 1.764567865 5.618108831 3.20371130 -5.40730120 4.67821864 -2.35548618
PC50 PC51 PC52 PC53 PC54 PC55 PC56 PC57 PC58 PC59
GSM2040792 -1.20285400 8.61392478 -7.016808557 -4.42393681 0.75140156 7.921483936 -0.65870057 -1.35392893 -6.3279946 2.390932592
GSM2040793 -2.92763209 -0.93485108 4.986744545 0.78388679 0.26717164 -0.356511441 -2.46282714 -0.53501628 4.1231611 4.616589904
GSM2040794 0.71132272 -1.20180963 1.371507456 1.33480705 1.22913973 4.352888661 -3.36052470 -2.01667366 2.1701272 6.148381366
GSM2040795 -3.42242482 -0.49419086 -3.794613687 2.20802064 2.94062886 -2.878448396 0.20947852 -3.53061491 5.4781342 3.179681998
GSM2040796 -4.73856032 -3.23378317 -0.826036144 2.18887554 -0.62619934 -0.046656395 1.16730942 1.62281420 -0.2995751 -5.234851576
GSM2040797 2.96889943 4.70831369 4.081270302 -2.10574692 4.40648525 1.291896666 -0.34114845 -4.58096322 4.7638330 -0.665624665
PC60 PC61 PC62 PC63 PC64 PC65 PC66 PC67 PC68 PC69
GSM2040792 4.19128602 -1.61176918 -1.75040430 -4.6588828496 0.83215521 0.068052981 2.34139919 -4.19571334 -0.89927766 -1.40673572
GSM2040793 1.93241326 1.95748706 4.16013027 2.5754698539 1.92842962 -0.009762846 -0.47552914 -3.82168008 -0.95632919 -3.26851516
GSM2040794 1.46827887 -1.94412979 4.07739355 2.5874953951 -1.06118717 -0.539764079 -1.79844479 -1.52922741 -0.83548659 2.79680380
GSM2040795 0.96267069 -0.77104645 -5.71158394 1.5039959060 -1.04638515 -1.151205745 2.88470565 7.63975457 0.83812712 -3.25930027
GSM2040796 0.88896501 -4.83382781 5.02117325 3.9752705193 7.45920093 0.656192869 0.89770330 -4.77158703 -1.39328851 -3.43867983
GSM2040797 -1.10941221 1.40905273 1.17064567 -6.3193567763 -3.31548116 -1.515263529 0.37838083 3.61193470 2.07866160 9.01009885
PC70 PC71 PC72 PC73 PC74 PC75 PC76 PC77 PC78 PC79
GSM2040792 -0.26347050 -0.389126192 -0.44213109 1.13055431 4.806304373 -3.128780030 -1.13922760 -3.26695292 -2.41456162 -4.921598463
GSM2040793 -2.64389041 -0.533748486 -1.26644945 -2.39199172 1.039367016 3.037619885 -1.80491779 -0.93885756 -4.09845662 2.881126901
GSM2040794 1.05001383 -0.761474499 -0.04198317 1.64197465 0.930195796 -4.391029801 2.13254016 -2.42126798 2.10243934 -0.029456247
GSM2040795 3.51302162 2.433177649 3.94509797 1.66007034 -1.796480269 -0.187367691 1.24883240 0.77722510 6.21053670 -2.041350212
GSM2040796 0.54993291 0.537027963 -2.19561426 -3.96703207 3.219152144 -1.443878902 1.57589196 -6.77609267 -0.52015914 -0.106125392
GSM2040797 1.72779540 -2.525183110 -4.60572750 1.50872746 1.240150743 2.328891024 -4.03371439 1.01721967 -4.53136037 2.616590118
PC80 PC81 PC82 PC83 PC84 PC85 PC86 PC87 PC88 PC89
GSM2040792 2.94940047 1.45065969 4.25510381 2.70134207 2.27196058 -4.6619410112 -0.51905734 -3.10468132 -1.91065331 0.26814515
GSM2040793 1.12390927 2.77943714 -0.38901438 -4.91285331 -3.71230463 1.8708272239 -2.51874412 0.56731580 1.28551201 0.58711482
GSM2040794 -1.19752553 0.72562919 -2.21676001 -1.24117406 -2.37514901 0.4501905268 1.01897916 0.53685941 -0.13102319 -0.24540684
GSM2040795 0.98377171 -0.37882791 -2.86254183 5.39864412 4.83689927 -6.1624890934 -3.53520242 -1.50021807 0.07693095 2.69542184
GSM2040796 -1.77326079 -3.71539372 -1.03657806 2.66402383 0.26403498 -3.4230130626 2.88053217 -2.25152995 1.38739592 1.30579876
GSM2040797 1.97516902 -0.60642179 2.14369239 -2.89021922 0.76218166 0.8385516984 -0.48562018 -0.23581874 1.69989507 -0.21632598
PC90 PC91 PC92 PC93 PC94 PC95 PC96 PC97 PC98 PC99
GSM2040792 0.150785962 5.35024617 1.238598889 1.51962386 1.32827505 -0.61698971 -4.694927454 -1.63307930 -3.354701233 -0.820930771
GSM2040793 -0.496032018 -1.91041616 -0.595014571 -0.43531186 0.66849683 -4.78154786 -0.312459142 3.33582981 3.390999033 2.508644894
GSM2040794 -1.926625527 2.09296747 1.735039274 -0.56890228 -2.59441477 4.50755540 1.630598052 1.54893177 -1.604464745 -0.191274205
GSM2040795 -0.006653669 1.23685232 -1.425608464 0.56957728 1.93960905 0.36094282 -3.261137318 -2.10517125 -0.771206239 -2.258108346
GSM2040796 0.986391128 0.52209776 0.533669346 3.07016874 1.26500556 -3.06613183 0.087194941 -0.20593220 -0.993841377 1.191137954
GSM2040797 2.245443500 -1.75433043 -2.502589786 1.32172803 -0.49885017 1.64105082 -2.244264297 -6.05185155 -2.797556778 -3.248640961
PC100 PC101 PC102 PC103 PC104 PC105 PC106 PC107 PC108 PC109
GSM2040792 1.49803201 3.217761801 1.348402535 0.43250281 2.15621172 0.767011659 -2.33475658 -1.42594421 -2.534271702 -1.09162947
GSM2040793 1.37079125 -4.920532908 -1.082084794 -0.29062168 0.56087870 -0.177253835 -0.07545270 -2.60172704 0.986991969 2.70454442
GSM2040794 -2.75536720 -0.182093099 -0.005725923 -0.91370932 -3.52212402 1.595803904 -1.07059681 -1.13292000 1.525563082 -1.15144364
GSM2040795 -0.54730007 -0.167104147 -0.677983459 2.23549166 2.01120222 1.262658204 -0.92563688 -1.21511452 -2.054460298 0.28996766
GSM2040796 -1.74404176 -2.013743836 0.476091287 -1.90401924 -1.84090672 0.484518746 1.69598539 -0.51832157 1.840808253 -1.99119246
GSM2040797 1.25502680 -2.693639149 3.139903325 3.07645072 -0.76372214 0.448582227 0.39474253 -0.14867615 3.904927379 4.97813687
PC110 PC111 PC112 PC113 PC114 PC115 PC116 PC117 PC118 PC119
GSM2040792 -2.81067638 -1.730405644 -0.30041646 0.03215978 4.3988898016 0.32888361 0.26548391 2.2295662120 -2.87510240 -0.69961408
GSM2040793 -0.41453832 1.438990450 -2.94153128 2.63979176 1.1462514423 -0.26291984 3.03015323 0.1418160182 0.28589403 -1.73602474
GSM2040794 -0.16983819 0.741169934 1.16767985 -2.01462841 -0.3601207738 -0.01058388 0.26274042 0.5714098824 0.68516101 4.58154707
GSM2040795 1.98200264 -0.972256984 0.50136930 -0.87597409 -1.3897920958 -0.97891321 -2.11043161 -2.4378800144 -0.24452715 -3.50653730
GSM2040796 0.79548922 -2.818500397 0.43467528 2.08477616 0.4253228914 0.11470259 -0.08901078 -0.7697083429 -0.57899397 0.61388726
GSM2040797 2.73738163 0.502888382 1.38042997 1.69540793 1.4057161883 1.32974203 -1.43833975 0.6146140825 -2.56467885 1.49616738
PC120 PC121 PC122 PC123 PC124 PC125 PC126 PC127 PC128 PC129
GSM2040792 -0.76863504 -0.05381478 -1.58223700 -0.41196309 -0.31160382 -1.07570096 2.30714835 1.50267352 0.41021709 1.122126930
GSM2040793 3.86936444 -0.90889618 0.59623781 1.61749995 -1.33862361 -1.95652754 -1.35066279 -1.41496757 -1.84549177 -4.583255974
GSM2040794 -1.43708033 1.35646446 -0.99059507 0.52125723 -0.03147995 0.63482666 0.54297132 -0.05615791 4.72600899 6.720015114
GSM2040795 -3.80902710 1.21850498 -0.68636333 1.65945191 0.92111640 0.12755215 -1.09636829 0.46975912 -1.61691615 -1.747402359
GSM2040796 1.12549255 -1.93336254 -1.10816758 -0.94290299 0.99973915 -0.47709612 0.73071503 0.88063018 0.26715576 0.366515884
GSM2040797 1.26560977 0.23331317 0.92893860 0.97265047 -1.83542895 1.00191092 -0.32009314 2.26290168 -0.97079916 -0.688989960
PC130 PC131 PC132 PC133 PC134 PC135 PC136 PC137 PC138 PC139
GSM2040792 0.05243377 1.719511088 0.92849005 -2.41015983 -1.58230720 -1.873445326 1.556663389 2.341496504 0.5409756322 1.107048448
GSM2040793 3.68476720 -1.188416751 -2.06667566 0.35305297 0.35051252 -0.545775822 -1.508882395 -0.475946880 1.4095726465 4.430376940
GSM2040794 -3.98241389 1.546582704 1.72536637 -1.97355632 -1.44126364 3.373380563 1.043928675 -1.565442623 -0.7701613743 -6.742326725
GSM2040795 1.90687166 -2.651946683 2.62416316 1.30311240 -1.07012465 -3.032178789 0.296002652 0.786847157 -1.7186375905 3.156957470
GSM2040796 1.64981959 -0.033204956 0.54512481 0.89359139 -0.23015435 0.039849359 0.830302054 -0.527528446 -0.3663443956 -0.209927867
GSM2040797 0.98662215 -1.129994025 -1.81917664 -0.50749356 0.46235582 -0.681593549 1.067604775 0.077288856 0.0581103928 -0.134045443
PC140 PC141 PC142 PC143 PC144 PC145 PC146 PC147 PC148 PC149
GSM2040792 0.18215091 -2.17447518 -1.09930366 0.492231126 0.009040347 0.33278162 -0.568080735 0.57965511 -0.21585714 0.497396786
GSM2040793 2.08637237 0.31218333 0.41750846 2.782575306 0.478427320 1.25062191 -3.815725639 0.28625798 0.00884119 0.076060482
GSM2040794 -1.77022914 -0.67808501 0.18961429 -3.954743351 -3.479171264 -2.00247052 5.142802250 -0.52076025 0.67425691 0.636811379
GSM2040795 -1.18386750 2.03517489 0.08120949 -0.579022448 1.328282550 1.56928843 -1.142621862 0.81543883 0.24435047 0.459989874
GSM2040796 0.10281617 0.55787360 -0.65440099 0.254577422 -0.191324042 -0.23692024 0.745416084 -0.25460850 1.02182318 0.004068692
GSM2040797 1.54842975 -0.79174253 0.44800061 -2.053983481 2.192225730 -1.62562249 -0.582344326 -0.41507597 0.05489578 -0.291139968
PC150 PC151
GSM2040792 -1.918155e-02 -4.648163e-13
GSM2040793 3.986442e-02 -4.722894e-13
GSM2040794 -5.539206e-02 -4.719866e-13
GSM2040795 -3.648964e-02 -4.682413e-13
GSM2040796 -1.134902e-01 -4.329052e-13
GSM2040797 1.299697e-01 -4.453689e-13
[ reached getOption("max.print") -- omitted 145 rows ]
pc2_scores <- pca_noscale$x[, 2] #So extract PC2:
#Step 2 — Compare PC1 between groups
#We do a simple t-test, Why a t-test? Because: PC2 is continuous, group is binary (COPD vs Control), we are testing mean difference
t.test(pc2_scores ~ group)
Welch Two Sample t-test
data: pc2_scores by group
t = -6.519, df = 103.9, p-value = 2.602e-09
alternative hypothesis: true difference in means between group Control and group COPD is not equal to 0
95 percent confidence interval:
-46.74392 -24.93842
sample estimates:
mean in group Control mean in group COPD
-26.34682 9.49435
#Step 3 — Visualize it
library(ggplot2)
df_pc2 <- data.frame(
PC2 = pc2_scores,
group = group
)
ggplot(df_pc2, aes(group, PC2, fill = group)) +
geom_boxplot(alpha = 0.6) +
theme_minimal() +
labs(title = "PC2 Distribution by Disease Status")
PCA showed that PC1 explains most variance but isn’t associated with COPD status, suggesting dominant variability is driven by other factors. However, PC2 is strongly associated with COPD (p ≈ 2.6×10⁻⁹), indicating a robust disease-related transcriptional axis despite overall heterogeneity.
Even though PC1 explains ~60% of overall variance, it’s not disease-associated (your PC1 p ≈ 0.87). But PC2 is disease-associated → the COPD signal is real, just not the largest source of variation in the dataset.
That is super common in human lung tissue:
-> PC1 often captures big background effects (cell-type mix, batch, smoking, RNA quality, inflammation level, etc.). -> PC2 can capture a more specific phenotype axis (here: COPD vs control).
#Differential Expression Analysis (limma) What we are doing We are testing, gene by gene, whether expression differs between COPD and Control.
Why we are doing it ->PCA tells us global structure. ->limma tells us which specific genes drive disease differences.
This is the step that produces candidate biomarkers.
library(limma)
# Step 1 — Build the design matrix
design <- model.matrix(~ group)
colnames(design)
[1] "(Intercept)" "groupCOPD"
head(design)
(Intercept) groupCOPD
1 1 0
2 1 0
3 1 0
4 1 0
5 1 1
6 1 1
# Step 2 — Fit gene-wise linear models
fit <- lmFit(expr_log, design)
# Step 3 — Apply empirical Bayes moderation
fit <- eBayes(fit)
# Step 4 — Extract results for COPD effect
results <- topTable(
fit,
coef = "groupCOPD",
number = Inf,
adjust.method = "BH"
)
head(results)
# Step 5 — Count significant genes
sum(results$adj.P.Val < 0.05)
[1] 1812
library(ggplot2)
results$significant <- results$adj.P.Val < 0.05
ggplot(results, aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(aes(color = significant), alpha = 0.7, size = 2.5) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "gray40", linewidth = 0.7) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray40", linewidth = 0.7) +
scale_color_manual(
values = c("FALSE" = "steelblue", "TRUE" = "firebrick"),
labels = c("FALSE" = "Not Significant", "TRUE" = "Significant")
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16), # centered title
axis.title = element_text(face = "bold"), # bold axis labels
legend.title = element_text(face = "bold"), # bold legend title
legend.position = "right",
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "white", color = NA)
) +
labs(
title = "Differential Expression — COPD vs Control",
x = "Log2 Fold Change",
y = "-log10(FDR)",
color = "Significance"
)
#Identify Top Up- and Down-Regulated Genes
top_up <- results[results$logFC > 0 & results$adj.P.Val < 0.05, ][1:10, ]
top_down <- results[results$logFC < 0 & results$adj.P.Val < 0.05, ][1:10, ]
top_up
top_down
cat("I performed limma differential expression modelling using COPD status as the predictor. After empirical Bayes moderation and FDR correction, I identified 1812 genes significantly associated with COPD. The volcano plot demonstrated both upregulated and downregulated transcriptional programs, consistent with widespread molecular remodeling in diseased lung tissue.")
I performed limma differential expression modelling using COPD status as the predictor. After empirical Bayes moderation and FDR correction, I identified 1812 genes significantly associated with COPD. The volcano plot demonstrated both upregulated and downregulated transcriptional programs, consistent with widespread molecular remodeling in diseased lung tissue.
After QC and PCA exploration, I performed supervised differential expression using limma. I modeled gene expression as a function of COPD status, applied empirical Bayes moderation, controlled FDR using Benjamini-Hochberg, and identified significantly dysregulated genes associated with COPD.
# What we're doing: keep genes with FDR + decent effect size
# Why: avoids “statistically significant but tiny” changes
sig_strong <- results[results$adj.P.Val < 0.05 & abs(results$logFC) > 0.5, ]
nrow(sig_strong)
[1] 1171
head(sig_strong)
NA
# What we’re doing: Convert rownames (ILMN_...) into an explicit column.
# Why: We need ProbeID to merge with annotation tables.
sig_strong$ProbeID <- rownames(sig_strong)
head(sig_strong$ProbeID)
[1] "ILMN_1676938" "ILMN_3284222" "ILMN_1813131" "ILMN_3256445" "ILMN_3252936" "ILMN_2168866"
IMPORTANT: Use Symbol (not ILMN_Gene). And also grab Entrez_Gene_ID directly (more robust).
library(GEOquery)
library(dplyr)
gpl <- getGEO("GPL10558")
gpl_tab <- Table(gpl)
# Build mapping table
annot_map <- gpl_tab %>%
transmute(
ProbeID = as.character(ID),
SYMBOL = as.character(Symbol),
ENTREZID = as.character(Entrez_Gene_ID)
)
# Clean up symbol + entrez
annot_map$SYMBOL <- trimws(annot_map$SYMBOL)
annot_map$ENTREZID <- trimws(annot_map$ENTREZID)
annot_map$SYMBOL[annot_map$SYMBOL %in% c("", "NA")] <- NA
annot_map$ENTREZID[annot_map$ENTREZID %in% c("", "NA")] <- NA
# If SYMBOL contains multi-maps like "A /// B" or "A;B", keep first
annot_map$SYMBOL <- sub("///.*$", "", annot_map$SYMBOL)
annot_map$SYMBOL <- sub(";.*$", "", annot_map$SYMBOL)
annot_map$SYMBOL <- trimws(annot_map$SYMBOL)
head(annot_map)
sum(!is.na(annot_map$SYMBOL))
[1] 44837
sum(!is.na(annot_map$ENTREZID))
[1] 43960
annotated <- sig_strong %>%
left_join(annot_map, by = "ProbeID")
# sanity checks
head(annotated[, c("ProbeID","SYMBOL","ENTREZID","logFC","adj.P.Val")])
mean(!is.na(annotated$SYMBOL))
[1] 0.9897523
mean(!is.na(annotated$ENTREZID))
[1] 0.9897523
table(is.na(annotated$SYMBOL))
FALSE TRUE
1159 12
If mean(!is.na(annotated$SYMBOL)) is low (<0.7), we can still proceed using ENTREZIDs if those map well.
# Upregulated in COPD
entrez_up <- annotated %>%
filter(logFC > 0) %>%
pull(ENTREZID) %>%
na.omit() %>%
unique()
# Downregulated in COPD
entrez_down <- annotated %>%
filter(logFC < 0) %>%
pull(ENTREZID) %>%
na.omit() %>%
unique()
length(entrez_up)
[1] 69
length(entrez_down)
[1] 1045
head(entrez_up)
[1] "3357" "761" "348" "6507" "6363" "2266"
head(entrez_down)
[1] "649214" "391359" "643431" "100130289" "344866" "81688"
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
for (p in c("clusterProfiler", "org.Hs.eg.db", "enrichplot")) {
if (!requireNamespace(p, quietly = TRUE)) {
BiocManager::install(p, ask = FALSE, update = FALSE)
}
}
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
if (length(entrez_up) < 10) stop("Too few upregulated genes for enrichment. Relax thresholds.")
ego_up <- enrichGO(
gene = entrez_up,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE
)
head(as.data.frame(ego_up))
NA
if (length(entrez_down) < 10) stop("Too few downregulated genes for enrichment. Relax thresholds.")
ego_down <- enrichGO(
gene = entrez_down,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE
)
head(as.data.frame(ego_down))
NA
library(ggplot2)
# Upregulated GO dotplot
dotplot(ego_up, showCategory = 15) +
ggtitle("GO BP — Upregulated in COPD") +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
axis.title = element_text(face = "bold"),
axis.text.y = element_text(face = "bold", size = 10),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# Downregulated GO dotplot
dotplot(ego_down, showCategory = 15) +
ggtitle("GO BP — Downregulated in COPD") +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
axis.title = element_text(face = "bold"),
axis.text.y = element_text(face = "bold", size = 10),
axis.text.x = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
write.csv(annotated, "limma_sig_strong_annotated.csv", row.names = FALSE)
write.csv(as.data.frame(ego_up), "GO_up_COPD.csv", row.names = FALSE)
write.csv(as.data.frame(ego_down), "GO_down_COPD.csv", row.names = FALSE)
What we are doing We’re checking if the limma results behave like real biology: ->Volcano plot: effect size vs significance ->MA plot: mean expression vs logFC
Why we are doing it If you see weird shapes (e.g., everything significant, or only low-expression genes), it often means: ->mapping issues ->normalization issues ->hidden batch/covariate effects
library(ggplot2)
library(dplyr)
# Ensure results has ProbeID
res_df <- results %>%
mutate(ProbeID = rownames(results),
negLog10P = -log10(P.Value),
sig = adj.P.Val < 0.05 & abs(logFC) > 0.5)
# Volcano
ggplot(res_df, aes(x = logFC, y = negLog10P)) +
geom_point(aes(alpha = sig), size = 1.2) +
theme_minimal(base_size = 13) +
labs(title = "Volcano Plot (COPD vs Control)",
x = "log2 Fold Change",
y = "-log10(p-value)") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# MA plot
ggplot(res_df, aes(x = AveExpr, y = logFC)) +
geom_point(aes(alpha = sig), size = 1.2) +
geom_hline(yintercept = 0, linetype = "dashed") +
theme_minimal(base_size = 13) +
labs(title = "MA Plot (COPD vs Control)",
x = "Average Expression",
y = "log2 Fold Change") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
What we are doing We’ll visualise the top differentially expressed genes across all samples.
Why we are doing it A heatmap shows: ->whether samples cluster by COPD vs Control ->whether the “signature” is consistent or noisy ->whether outliers exist
# We'll use pheatmap for a clean heatmap
if (!requireNamespace("pheatmap", quietly = TRUE)) install.packages("pheatmap")
library(pheatmap)
# Pick top genes by FDR
topN <- 50
top_probes <- rownames(results[order(results$adj.P.Val), ])[1:topN]
# expr_log should already exist (genes x samples, log2 transformed)
mat_top <- expr_log[top_probes, ]
# Z-score per gene for heatmap visibility
mat_z <- t(scale(t(mat_top)))
ann_col <- data.frame(Group = group)
rownames(ann_col) <- colnames(mat_z)
pheatmap(
mat_z,
annotation_col = ann_col,
show_colnames = FALSE,
fontsize_row = 7,
main = paste("Top", topN, "DE Probes (Z-scored)")
)
What we are doing We inspect GEO phenotype columns (age/sex/smoking, etc.) and see if they align with PC1/PC2 or with group imbalance.
Why we are doing it ->Your PCA already said: PC1 is not COPD, PC2 is COPD. ->That screams: “PC1 might be smoking, batch, tissue quality, cell composition, etc.”
# Quick scan for useful phenotype fields
colnames(pheno)
[1] "title" "geo_accession" "status" "submission_date" "last_update_date"
[6] "type" "channel_count" "source_name_ch1" "organism_ch1" "characteristics_ch1"
[11] "characteristics_ch1.1" "characteristics_ch1.2" "characteristics_ch1.3" "characteristics_ch1.4" "characteristics_ch1.5"
[16] "characteristics_ch1.6" "characteristics_ch1.7" "characteristics_ch1.8" "characteristics_ch1.9" "characteristics_ch1.10"
[21] "characteristics_ch1.11" "characteristics_ch1.12" "treatment_protocol_ch1" "growth_protocol_ch1" "molecule_ch1"
[26] "extract_protocol_ch1" "label_ch1" "label_protocol_ch1" "taxid_ch1" "hyb_protocol"
[31] "scan_protocol" "description" "description.1" "data_processing" "platform_id"
[36] "contact_name" "contact_department" "contact_institute" "contact_address" "contact_city"
[41] "contact_state" "contact_zip/postal_code" "contact_country" "supplementary_file" "data_row_count"
[46] "age:ch1" "bmi:ch1" "copd:ch1" "fev1.pp:ch1" "fev1fvc:ch1"
[51] "ID:ch1" "laa950:ch1" "packyears:ch1" "perc15:ch1" "pi10:ch1"
[56] "race:ch1" "Sex:ch1" "tissue:ch1"
# Look at characteristics fields (GEO often stores info here)
pheno %>%
dplyr::select(title, geo_accession, dplyr::starts_with("characteristics")) %>%
head(10)
# Example: check if any characteristics correlate with PC1 (if numeric exists)
# (You'll customise once you see what columns exist.)
What we are doing Run Reactome pathway enrichment on up/down genes.
Why we are doing it Reactome pathways are often more interpretable than GO terms for disease biology.
# Reactome enrichment uses Entrez IDs
if (!requireNamespace("ReactomePA", quietly = TRUE)) BiocManager::install("ReactomePA", ask = FALSE, update = FALSE)
library(ReactomePA)
react_up <- enrichPathway(
gene = entrez_up,
organism = "human",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE
)
react_down <- enrichPathway(
gene = entrez_down,
organism = "human",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE
)
head(as.data.frame(react_up))
head(as.data.frame(react_down))
dotplot(react_up, showCategory = 15) + ggtitle("Reactome — Up in COPD")
dotplot(react_down, showCategory = 15) + ggtitle("Reactome — Down in COPD")
length(genes_down)
[1] 1058
entrez_down <- bitr(
genes_down,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db
)
nrow(entrez_down) # how many mapped
[1] 609
length(unique(entrez_down$ENTREZID))
[1] 609
head(entrez_down)
What we are doing We rank all genes by limma t-statistic and perform enrichment without arbitrary cutoffs. Why ->Over-representation can miss coordinated but subtle pathway shifts. ->GSEA is more sensitive and biologically robust.
# Create ranked gene list (by limma t-statistic)
# Create ranked gene list (by limma t-statistic)
gene_list <- results$t
# Use FULL mapping from GPL (not sig-only "annotated")
entrez_map_full <- annot_map %>%
dplyr::select(ProbeID, ENTREZID) %>%
dplyr::filter(!is.na(ENTREZID))
names(gene_list) <- entrez_map_full$ENTREZID[match(rownames(results), entrez_map_full$ProbeID)]
# Clean
gene_list <- gene_list[is.finite(gene_list)]
gene_list <- gene_list[!is.na(names(gene_list))]
gene_list <- sort(gene_list, decreasing = TRUE)
gene_list <- gene_list[!duplicated(names(gene_list))]
# sanity
all(diff(gene_list) <= 0)
[1] TRUE
length(gene_list)
[1] 15718
head(gene_list)
3357 761 348 6507 6363 2266
5.038668 4.687792 4.685257 4.561199 4.363037 4.332848
#Step 1 — Check structure
head(gene_list)
3357 761 348 6507 6363 2266
5.038668 4.687792 4.685257 4.561199 4.363037 4.332848
length(gene_list)
[1] 15718
summary(gene_list)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.1865 -0.3819 0.6384 0.3361 1.3721 5.0387
# Step 2 — Ensure sorted
# TRUE means it's already sorted decreasing
is_sorted_decreasing <- all(diff(gene_list) <= 0, na.rm = TRUE)
is_sorted_decreasing
[1] TRUE
all(diff(gene_list) <= 0, na.rm = TRUE)
[1] TRUE
#Quick full “pre-GSEA clean” block
# Make sure it's numeric and named
stopifnot(is.numeric(gene_list))
stopifnot(!is.null(names(gene_list)))
# Drop NA/Inf
gene_list <- gene_list[is.finite(gene_list)]
# Ensure names are characters (Entrez IDs)
names(gene_list) <- as.character(names(gene_list))
# Remove duplicates (keep first / best-ranked after sorting)
gene_list <- sort(gene_list, decreasing = TRUE)
gene_list <- gene_list[!duplicated(names(gene_list))]
# Final check: sorted decreasing?
all(diff(gene_list) <= 0, na.rm = TRUE)
[1] TRUE
What we’re doing We’re running rank-based pathway enrichment using the full limma signal (t-stat ranking), not just “sig genes”.
Why ORA (enrichPathway) depends on an arbitrary cutoff. GSEA catches coordinated shifts even if single genes aren’t extreme.
# Install if missing
if (!requireNamespace("ReactomePA", quietly = TRUE)) {
BiocManager::install("ReactomePA", ask = FALSE, update = FALSE)
}
if (!requireNamespace("enrichplot", quietly = TRUE)) {
BiocManager::install("enrichplot", ask = FALSE, update = FALSE)
}
library(ReactomePA)
library(enrichplot)
# GSEA (Reactome) using ranked t-stat list
gsea_reactome <- gsePathway(
geneList = gene_list, # named numeric vector: names = EntrezID
organism = "human",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
verbose = FALSE
)
# View results
head(as.data.frame(gsea_reactome))
# Plot GSEA results
# If no significant pathways, don't crash
if (is.null(gsea_reactome) || nrow(as.data.frame(gsea_reactome)) == 0) {
cat("No significant Reactome GSEA pathways at FDR < 0.05.\nTry pvalueCutoff=0.1 or check mapping.\n")
} else {
dotplot(gsea_reactome, showCategory = 15) +
ggtitle("Reactome GSEA — Ranked by limma t-stat") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}
gsea_df <- as.data.frame(gsea_reactome)
# Top positively enriched (COPD side, depending on your ranking direction)
head(gsea_df[order(gsea_df$p.adjust),
c("ID","Description","NES","p.adjust","core_enrichment")], 10)
# Strongest positive NES
head(gsea_df[order(-gsea_df$NES), c("Description","NES","p.adjust")], 10)
# Strongest negative NES
head(gsea_df[order(gsea_df$NES), c("Description","NES","p.adjust")], 10)
NA
# Pick the top pathway by adjusted p-value
top_path <- as.data.frame(gsea_reactome)$ID[1]
# Enrichment curve
gseaplot2(gsea_reactome, geneSetID = top_path, title = as.data.frame(gsea_reactome)$Description[1])
gsea_df <- as.data.frame(gsea_reactome)
gsea_df %>%
dplyr::arrange(p.adjust) %>%
dplyr::select(Description, NES, p.adjust) %>%
head(10)
Based on everything you’ve shown so far, your COPD signature likely has:
Upregulated: ->ECM organization ->Collagen degradation ->Integrin interactions ->ROBO signaling ->Inflammatory pathways
Downregulated: ->RNA splicing ->Ribosomal biogenesis ->Translation machinery
Rank-based Reactome GSEA revealed coordinated enrichment of tissue remodeling and SLIT–ROBO signaling pathways among genes upregulated in COPD lung tissue, consistent with structural reorganization and altered epithelial–stromal interactions. In contrast, multiple translational and ribosomal biogenesis pathways were enriched at the downregulated end of the ranked list, indicating suppression of core protein synthesis and RNA processing programs. Together, these findings suggest that COPD is characterized by a shift from normal epithelial biosynthetic activity toward chronic remodeling and stress-associated signaling programs.